---
title: "Replication of Table 2 in Muris (2019, Section 6, Empirical application)"
output: html_notebook
---

## Introduction

This document reproduces the results in Table 2 of the empirical application of the incomplete data paper. 

The structure of this document follows very closely that in `02-replication_incomplete.Rmd`. For this reason, the narrative and comments will be minimal: it should be easy to follow after seeing the previous file. If not, do not hesitate to contact me.

Start by load the required packages.

```{r}
library(tidyverse)
library(haven)
library(gmm)
library(mgcv)
library(Matrix)
```

Now load the data.

```{r}
(india <- read_dta("trade_liberalization.dta"))
```

We modify the data to include only the variables we need; to be tidy; and to be in the cross-sectional format we need.

```{r}
(wide_D_india <- india %>% 
  select(
    companyname, yr, tfp1000, tariff, lagtariff
  ) %>% 
  na.omit() %>% 
  gather(`tfp1000`, `tariff`, `lagtariff`, key = variable, value = value) %>% 
  unite(varname,variable,yr) %>% 
  spread(varname,value) %>% 
  transmute(
    
    Y94 = tfp1000_1994,
    Y93 = tfp1000_1993,
    Y92 = tfp1000_1992,
    Y91 = tfp1000_1991,
    Y90 = tfp1000_1990,
    Y89 = tfp1000_1989,
    
    dY96 = tfp1000_1996 - tfp1000_1995,
    dY95 = tfp1000_1995 - tfp1000_1994,
    dY94 = tfp1000_1994 - tfp1000_1993,
    dY93 = tfp1000_1993 - tfp1000_1992,
    dY92 = tfp1000_1992 - tfp1000_1991,
    dY91 = tfp1000_1991 - tfp1000_1990,
    dY90 = tfp1000_1990 - tfp1000_1989,
    
    dX96 = lagtariff_1996 - lagtariff_1995,
    dX95 = lagtariff_1995 - lagtariff_1994,
    dX94 = lagtariff_1994 - lagtariff_1993,
    dX93 = lagtariff_1993 - lagtariff_1992,
    dX92 = lagtariff_1992 - lagtariff_1991,
    dX91 = lagtariff_1991 - lagtariff_1990,
    dX90 = lagtariff_1990 - lagtariff_1989
  ))
```

Recall that `gmm` expects the data in matrix format.

```{r}
Z <- as.matrix(wide_D_india)
```

## Dynamic panel

The Arellano-Bond moment conditions are coded in the following chunk.

```{r}
topkha_ab <- function(theta,data,case){
  
  # Regression coefficient.
  beta = theta[1]
  rho = theta[2]
  
  # Difference in time dummies: will be constant per thing.
  dl91 = theta[3]
  dl92 = theta[4]
  dl93 = theta[5]
  dl94 = theta[6]
  dl95 = theta[7]
  dl96 = theta[8]
  
  dU96 = data[,"dY96"] - beta*data[,"dX96"] - rho*data[,"dY95"] - dl96
  dU95 = data[,"dY95"] - beta*data[,"dX95"] - rho*data[,"dY94"] - dl95
  dU94 = data[,"dY94"] - beta*data[,"dX94"] - rho*data[,"dY93"] - dl94
  dU93 = data[,"dY93"] - beta*data[,"dX93"] - rho*data[,"dY92"] - dl93
  dU92 = data[,"dY92"] - beta*data[,"dX92"] - rho*data[,"dY91"] - dl92
  dU91 = data[,"dY91"] - beta*data[,"dX91"] - rho*data[,"dY90"] - dl91

  moments <- cbind(
    
    # Lagged dependent
    data[,"Y94"]*dU96,
    data[,"Y93"]*dU96,
    data[,"Y92"]*dU96,
    data[,"Y91"]*dU96,
    data[,"Y90"]*dU96,
    data[,"Y89"]*dU96,
    
    data[,"Y93"]*dU95,
    data[,"Y92"]*dU95,
    data[,"Y91"]*dU95,
    data[,"Y90"]*dU95,
    data[,"Y89"]*dU95,
    
    data[,"Y92"]*dU94,
    data[,"Y91"]*dU94,
    data[,"Y90"]*dU94,
    data[,"Y89"]*dU94,
    
    data[,"Y91"]*dU93,
    data[,"Y90"]*dU93,
    data[,"Y89"]*dU93,
    
    data[,"Y90"]*dU92,
    data[,"Y89"]*dU92,
    
    data[,"Y89"]*dU91,
    
    # Exogenous covariates
    data[,"dX96"]*dU96,
    data[,"dX95"]*dU95,
    data[,"dX94"]*dU94,
    data[,"dX93"]*dU93,
    data[,"dX92"]*dU92,
    data[,"dX91"]*dU91,
    
    # Constant
    dU96,
    dU95,
    dU94,
    dU93,
    dU92,
    dU91
  )
  
  # Available case fix:
  if(case == "available"){
    moments[is.na(moments)] = 0
  }
  if(case == "complete"){
    moments <- moments[complete.cases(moments),]
  }
  
  # Return
  return(moments)
  
}

topkha_ab_cc <- function(theta,data){topkha_ab(theta,data,"complete")}
topkha_ab_ac <- function(theta,data){topkha_ab(theta,data,"available")}
```

Pass to `gmm`.

```{r}
out_cc_ab <- gmm(g = topkha_ab_cc, x = Z, t0 = numeric(8), 
                        type = "twoStep", vcov = "iid",
                        optfct = "nlminb")
out_ac_ab <- gmm(g = topkha_ab_ac, x = Z, t0 = numeric(8), 
                        type = "twoStep", vcov = "iid",
                        optfct = "nlminb")

summary(out_cc_ab)
summary(out_ac_ab)
```

## Incomplete data estimator

The following implementation uses only strata with at least 20 firms. It wraps the procedure for the static case into one function.

```{r}
incomplete_ab <- function(XX, start = numeric(8)){
  
  # What are the patterns?
  missings <- topkha_ab(numeric(8)+0.024879,XX,"case")
  missings[!is.na(missings)] = 1
  missings[is.na(missings)] = 0
  
  # Now extract the unique rows.
  patterns <- uniquecombs(missings,ordered=FALSE)
  
  # The following function will be used to create an index for which observations has which pattern.
  which_pat <- function(x,patterns){
    index <- NA
    J <- nrow(patterns)
    for(j in 1:J){
      if(min(x == patterns[j,])){
        index <- j
      }
    }
    return(index)
  }
  # Build the index
  strata <- apply(missings,1,which_pat,patterns = patterns)
  
  # Select the patterns that occur at least once.
  step1 <- as_tibble(strata) %>% group_by(value) %>% summarize(count = n()) %>% arrange(desc(count)) %>% filter(count>20)
  use_these_patterns <- step1$value

  # The following function returns a set of moment conditions for a given stratum.
  topkha_ab_stratum <- function(stratum,theta,data){
    
    bare <- topkha_ab(theta,data[strata == stratum,],"case")
    return(
      bare[,patterns[stratum,] == 1]
    )
  }
  
  # Stack 'em
  topkha_ab_efficient <- function(theta,data){
    all_moments <- bdiag(use_these_patterns %>% map(topkha_ab_stratum,theta = theta, data = data))

    return(as.matrix(all_moments))
  }
  
  out <- gmm(g = topkha_ab_efficient, x = XX, t0 = start,
                        type = "twoStep", vcov = "iid",
                        optfct = "nlminb")

  return(out)
  
}
```

Alternatively, can add the smaller strata as one additional stratum. This is not efficient, but more efficient than leaving them out. Basically, this uses available case reasoning to combine the information. without the need to estimate the weight matrix.

```{r}
incomplete_ab_plus <- function(XX, start = numeric(8)){
  
  # What are the patterns?
  missings <- topkha_ab(numeric(8)+0.024879,XX,"case")
  missings[!is.na(missings)] = 1
  missings[is.na(missings)] = 0
  
  # Now extract the unique rows.
  patterns <- uniquecombs(missings,ordered=FALSE)
  
  # The following function will be used to create an index for which observations has which pattern.
  which_pat <- function(x,patterns){
    index <- NA
    J <- nrow(patterns)
    for(j in 1:J){
      if(min(x == patterns[j,])){
        index <- j
      }
    }
    return(index)
  }
  # Build the index
  strata <- apply(missings,1,which_pat,patterns = patterns)
  
  # Select the patterns that occur at least once.
  step1 <- as_tibble(strata) %>% group_by(value) %>% summarize(count = n()) %>% arrange(desc(count)) %>% filter(count>20)
  use_these_patterns <- step1$value

  # The following function returns a set of moment conditions for a given stratum.
  topkha_ab_stratum <- function(stratum,theta,data){
    
    bare <- topkha_ab(theta,data[strata == stratum,],"case")
    return(
      bare[,patterns[stratum,] == 1]
    )
  }
  
  # Stack 'em
  topkha_ab_efficient <- function(theta,data){
    
    # This line takes all the observation with patterns selected above, stacks their moment conditions.
    all_moments <- bdiag(use_these_patterns %>% map(topkha_ab_stratum,theta = theta, data = data))
    
    # Grab one set of moment conditions from the smaller patterns.
    the_others <- topkha_ab_ac(theta,  data[ !( strata %in% use_these_patterns ) , ])
    return(as.matrix(bdiag(all_moments,the_others)))
  }
  
  out <- gmm(g = topkha_ab_efficient, x = XX, t0 = start,
                        type = "twoStep", vcov = "iid",
                        optfct = "nlminb")

  return(out)
  
}
```

Call the two functions that implement the procedures in the paper.

```{r}
what <- incomplete_ab(Z)
what_plus <- incomplete_ab_plus(Z)
```

Compare all results: 

1. complete case
2. available case
3. efficient estimator with strata with > 20 firms
4. efficient estimator with small strata collapsed into one

```{r}
cbind(out_cc_ab$coefficients,out_ac_ab$coefficients,what$coefficients,what_plus$coefficients)
```

The effect of using the additional strata is small.

